Load libraries and DE genes

library(ggplot2)
library(readxl)
library(ggrepel)
library(kableExtra)
library(tidyverse)
library(org.Hs.eg.db)
library(clusterProfiler)
library(enrichplot)
library(msigdbr)
library(ggplot2)
hs=get("org.Hs.eg.db")

MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

load(paste0(MEDIAsave,"DE_DESeq2_SCell.RData"))
singleCell <- res3
singleCell$gene_name <-rownames(singleCell)

load(paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"))

unpaired <-OAvsnonOA

load(paste0(MEDIAsave,"results_DE_EGApaired.RData"))
paired <-df.res

load(paste0(MEDIAsave,"DEgenes_from_validation.RData"))
valid <-dff

rm(df.res,OAvsnonOA,res3)

up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)

Pre-process dataset for the Enrichment Bubble Plot construction

riskGenes <-c("DNER","TNFSF11","LOXL3","THBS3","ASPN","DYSF","TNNT3","HTRA1")

ddsc <-singleCell$gene_name[grep("\\.",singleCell$gene_name)]
ss=ddsc[-grep("^AP0.+|^RP.+-|^CT.-.+|^AC.+|^AL.|^GS1-.+|^XX.+|^LL.+",ddsc)]

singleCell <-singleCell[!singleCell$gene_name %in% c("CRYBG3.1", "SRSF10.1"),]
ss <-ss[!ss%in%c("CRYBG3.1", "SRSF10.1")]

singleCell[singleCell$gene_name %in% ss,"gene_name"] <- gsub("\\..*","",ss)
rm(ss,ddsc)

singleCell <-singleCell[,c(2,5,6)]
colnames(singleCell) <- c("log2FC","p_adj","gene_name")
singleCell$class <-"Dataset 1"

paired <-paired[,c(3,7,8)]
colnames(paired) <- c("log2FC","p_adj","gene_name")
paired$class <-"Dataset 2"

unpaired <-unpaired[,c(3,7,8)]
colnames(unpaired) <- c("log2FC","p_adj","gene_name")
unpaired$class <-"Dataset 3"

valid <-valid[,c(2,3,1)]
colnames(valid) <- c("log2FC","p_adj","gene_name")
valid$class <-"Dataset 4/Validation"

total_genes <-rbind(singleCell, paired, unpaired, valid)
rownames(total_genes) <-paste0(1:nrow(total_genes))

total_genes[1:10,]
##       log2FC        p_adj gene_name     class
## 1  14.356062 1.319705e-73    COL1A1 Dataset 1
## 2   1.605177 1.141659e-64    ABI3BP Dataset 1
## 3   2.863385 2.653574e-64     TGFBI Dataset 1
## 4   1.685722 8.423114e-60    TSPAN2 Dataset 1
## 5   2.232388 8.576118e-53    PDLIM7 Dataset 1
## 6  12.742026 3.912155e-49     CRLF1 Dataset 1
## 7   1.863951 7.741079e-48      MT1G Dataset 1
## 8   2.173891 1.781909e-47    CRTAC1 Dataset 1
## 9  -1.184766 1.076830e-42     KLF10 Dataset 1
## 10 11.641215 4.644320e-41     CLIC3 Dataset 1

Annotation

total_genes$col <-"gray50"
total_genes[total_genes$log2FC <= -1.5 & total_genes$class %in% c("Dataset 1","Dataset 3","Dataset 4/Validation"), "col"] <- "springgreen3"
total_genes[total_genes$log2FC <= -0.8 & total_genes$class=="Dataset 2", "col"] <- "springgreen3"


total_genes[total_genes$log2FC >= 1.5 & total_genes$class %in% c("Dataset 1","Dataset 3","Dataset 4/Validation"), "col"] <- "red"
total_genes[total_genes$log2FC >= 0.8 & total_genes$class=="Dataset 2", "col"] <- "red"

total_genes[total_genes$gene_name %in% c(up_genes$gene,dw_genes$gene), "col"] <- "blue"
total_genes[total_genes$p_adj > 0.05, "col"] <- "gray50"

total_genes$label <- NA
total_genes[total_genes$gene_name %in% riskGenes, "label"] <- total_genes[total_genes$gene_name %in% riskGenes,"gene_name"]


total_genes$Shape <- "unselected"
total_genes[total_genes$gene_name %in% riskGenes, "Shape"] <- "selected risk.sc."

total_genes$ss <-2
total_genes[total_genes$gene_name %in% riskGenes, "ss"] <- 4

Volcano Plots

ggplot(data=total_genes, aes(x=log2FC, y=-log10(p_adj), color=col,label=label, shape=Shape)) +
  geom_point(size=total_genes$ss) +
  geom_label_repel(size=6,
                   min.segment.length = 0,direction = "both",
                   max.overlaps =40,
                   color="black",
                   nudge_x=1,
                   segment.linetype=2) +
  scale_shape_manual(values=c(8, 20))+
  scale_color_manual(values = c("red","gray50","springgreen3","blue"),
                     breaks = c("red","gray50","springgreen3","blue"),
                     labels = c("UP","notDE","DOWN","consensus sig.")) +
  labs(color="Legend")+
  geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)+
  theme_bw()+
  facet_wrap(.~class, scales = "free_y", ncol=2)+
  theme(strip.text = element_text(size = 18, face = "bold"),
      legend.title=element_text(size=16), 
      legend.text=element_text(size=14))+
       guides(color = guide_legend(override.aes = list(size = 4)),
              shape = guide_legend(override.aes = list(size = 4)))

GSEA: GO BP and REACTOME analyses, select processes commonly enriched by all the datasets

genes1 <-as.numeric(total_genes[total_genes$class=="Dataset 1",1])
names(genes1) <-total_genes[total_genes$class=="Dataset 1",3]

genes2 <-as.numeric(total_genes[total_genes$class=="Dataset 2",1])
names(genes2) <-total_genes[total_genes$class=="Dataset 2",3]

genes3 <-as.numeric(total_genes[total_genes$class=="Dataset 3",1])
names(genes3) <-total_genes[total_genes$class=="Dataset 3",3]

list_genes <-list(genes1, genes2, genes3)

list_obj <-vector("list",3)
names(list_obj) <-c("gsebp1","gsebp2","gsebp3")

list_objR <-vector("list",3)
names(list_objR) <-c("ans1","ans2","ans3")


rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)

for(i in 1:length(list_genes)){
  x <-sort(list_genes[[i]], decreasing = TRUE)
  
  gsebp <- gseGO(geneList=x, 
                 ont ="BP", 
                 keyType = "SYMBOL", 
                 pvalueCutoff = 0.15,
                 verbose = TRUE, 
                 minGSSize = 5,
                 OrgDb = hs, 
                 pAdjustMethod = "fdr")
  list_obj[[i]] <-gsebp
  
  gsreac <- clusterProfiler::GSEA(x,TERM2GENE = sig,
                               minGSSize =5, pAdjustMethod = "fdr",
                               pvalueCutoff =0.15)
  list_objR[[i]] <-gsreac
}


rb1=list_obj[[1]]@result
rb1$class <-"Dataset 1"
rb2=list_obj[[2]]@result
rb2$class <-"Dataset 2"
rb3=list_obj[[3]]@result
rb3$class <-"Dataset 3"

rc1=list_objR[[1]]@result
rc1$class <-"Dataset 1"
rc2=list_objR[[2]]@result
rc2$class <-"Dataset 2"
rc3=list_objR[[3]]@result
rc3$class <-"Dataset 3"


table_allBP <-rbind(rb1,rb2,rb3)
table_allBP <-table_allBP[table_allBP$NES > 0,]

table_allRc <-rbind(rc1,rc2,rc3)
table_allRc <-table_allRc[table_allRc$NES > 0,]


sell <-names(table(table_allBP$ID)[table(table_allBP$ID)==3])
sellr <-names(table(table_allRc$ID)[table(table_allRc$ID)==3])


sel_pb <-table_allBP[table_allBP$ID %in% sell,]
sel_rc <-table_allRc[table_allRc$ID %in% sellr,]

sel_pb <-sel_pb[,c(2,5,7,12)]
sel_rc <-sel_rc[,c(1,5,7,12)]

Bubble plot of common enriched processes

ggplot(sel_pb, aes(y =class , x =Description)) + 
  ggtitle("GO: Biological Processes enriched in all the three experiments") +
  geom_point(data=sel_pb,aes(y =class , x =Description, size = NES, colour = -log(p.adjust)), alpha=.8)+
  scale_color_gradient(low="blue",high="red")+
  theme_bw()+ 
  theme(plot.title = element_text(face="bold",size=16),plot.margin = unit(c(2,1,1,2), "cm")) +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1, size=15, face="bold"))+
  theme(axis.text.y = element_text(size=13, face="bold"))+
  labs(x="", y="", size="NES", color="-log10(FDR)")

ggplot(sel_rc, aes(y =class , x =ID)) + 
  ggtitle("REACTOME Pathways Processes enriched in all the three experiments") +
  geom_point(data=sel_rc,aes(y =class , x =ID, size = NES, colour = -log(p.adjust)), alpha=.8)+
  scale_color_gradient(low="blue",high="red")+
  theme_bw()+ 
  theme(plot.title = element_text(face="bold", size=16),plot.margin = unit(c(3,1,1,3), "cm")) +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1, size=11, face="bold"))+
  theme(axis.text.y = element_text(size=11, face="bold"))+
  labs(x="", y="", size="NES", color="-log10(FDR)")